library(foreign)
library(ggplot2)
library(quanteda)
library(readtext)

# Centered moving average over `n` consecutive observations.
#
# x: numeric vector (or time series) to smooth.
# n: window width; the default of 3 averages each unit with its +/- 1
#    neighbours.
#
# Returns a `ts` object of the same length as `x`. Positions that do not have
# a full window around them (the edges of the series) are NA.
ma <- function(x, n = 3) {
  # Call stats::filter explicitly: a bare `filter` resolves to dplyr::filter
  # as soon as dplyr (or the tidyverse) is attached, which breaks on vectors.
  stats::filter(x, rep(1 / n, n), sides = 2)
}

# Set the working directory so the relative "./data/..." paths used below
# resolve against the analysis folder.
# NOTE(review): this is itself a relative path, so the script has to be
# started from the directory that contains "1_Media Content Analysis".
setwd("1_Media Content Analysis")

# Load Data ---------------------------------------------------------------
# Restores the objects stored in text_corpus.rdata into the workspace.
# NOTE(review): `med_corp` (used below) is never created in this script, so
# it presumably comes from this file -- confirm.
load("./data/text_corpus.rdata")

# Create Doc-Feature Matrix ---------------------------------------------------------------------
# Tokenize the corpus, dropping symbol tokens. Spelled TRUE rather than T
# because T is an ordinary variable that can be reassigned.
tks <- tokens(med_corp, remove_symbols = TRUE)

# Multi-word expressions that must survive as single tokens: tokens_compound()
# joins the words of each phrase into one token, which is what the
# underscore-joined race terms further down rely on.
key_phrases <- c(
  "Black people", "Black person",
  "African American", "African Americans",
  "Black American", "Black Americans",
  "George Floyd", "Derek Chauvin", "Breonna Taylor",
  "Michael Brown", "Eric Garner", "Jacob Blake",
  "Black Lives Matter",
  "Donald Trump", "Joe Biden", "Kamala Harris", "Bernie Sanders",
  "Vice President",
  "police violence", "criminal justice"
)
tks <- tokens_compound(tks, pattern = phrase(key_phrases))

# Document-feature matrix built from the compounded tokens.
med_dfm <- dfm(tks)

# Quick look at the first 5 features.
med_dfm[, 1:5]

# Leftover word fragments to drop alongside the standard English stopwords.
# NOTE(review): these look like contraction pieces left after tokenization
# ("don" from "don't", "ve" from "I've", ...) -- confirm.
contraction_stubs <- c(
  "ve", "re", "don",
  "doesn", "ll",
  "didn", "wouldn",
  "won", "isn", "wasn", "aren",
  "haven", "mustn", "shan",
  "couldn", "shouldn"
)

# Remove stopwords and fragments; min_nchar = 2 additionally drops
# one-character features.
med_dfm <- dfm_remove(
  med_dfm,
  pattern = c(stopwords("english"), contraction_stubs),
  min_nchar = 2
)

# select only documents mentioning race-related terms
race_terms <- c("black_people", "blacks",
                "black_person",
                #"black", # add even if false positive?
                "black_american", "black_americans",
                # "racial", "racism",
                "african_american", "african_americans",
                "african-american", "african-americans")

# DFM restricted to the race-term features; it still has one row per
# document of med_dfm.
race_hits <- dfm_select(med_dfm, pattern = race_terms)

# Row numbers of documents with at least one race term.
# `@i` of the underlying sparse matrix stores one ZERO-based row index per
# non-zero cell, so:
#   * + 1L converts to R's one-based indexing;
#   * unique() is required because a document that contains several different
#     race terms appears once per term, and subsetting with repeated indices
#     would select that document more than once and inflate every count
#     derived from race_dfm;
#   * sort() restores the original document order (`@i` is ordered by
#     feature column first).
race_indx <- sort(unique(race_hits@i)) + 1L

# use index to subset DFM and corpus (rows of med_dfm are the documents of
# med_corp, so the same positions apply to both)
race_dfm <- med_dfm[race_indx, ]
race_corp <- med_corp[race_indx]
# prop.table(table(docvars(race_dfm)$channel))

# Print one matching document as a sanity check.
print(race_corp[1], max_nchar = 4000)

# * Backlash Coverage ----------------------------------------------------------------
# Race-mentioning documents per date and channel.
race_counts <- table(docvars(race_dfm)$date_md, docvars(race_dfm)$channel)

# select only documents mentioning negative terms
# (entries are word stems, matched against the stemmed DFM below)
neg_terms <- c("violent", "violenc", # made police violence compound so not included
               "riot", "rioter",
               "loot", "looter",
               "crime", "crimin",
               "mob")

# Stem the race DFM and keep only the negative-term features; rows still
# line up one-to-one with race_dfm.
neg_hits <- dfm_select(dfm_wordstem(race_dfm), pattern = neg_terms,
                       valuetype = "glob")
topfeatures(neg_hits)

# Row numbers of race documents with at least one negative term.
# `@i` is ZERO-based and has one entry per non-zero cell. Used as-is it
# selects the document *before* each match (and silently drops index 0), and
# repeats documents that contain several different terms. Convert to
# one-based, de-duplicate and restore document order.
neg_indx <- sort(unique(neg_hits@i)) + 1L

# use index to subset DFM
neg_dfm <- race_dfm[neg_indx, ]

# Print the first matching document as a sanity check.
print(race_corp[neg_indx[1]], max_nchar = 4000)

# Backlash-frame documents per date and channel.
neg_counts <- table(docvars(neg_dfm)$date_md, docvars(neg_dfm)$channel)

# * Activism Coverage ----------------------------------------------------------------
# Race-mentioning documents per date and channel.
race_counts <- table(docvars(race_dfm)$date_md, docvars(race_dfm)$channel)

# select only documents mentioning activism (discrimination / racism) terms
# (entries are word stems, matched against the stemmed DFM below)
pos_terms <- c("discrimin", "antidiscrimin", "discriminatori", "anti-discrimin",
               "racist", "racism", "anti-racist")

# Stem the race DFM and keep only the activism-term features; rows still
# line up one-to-one with race_dfm.
pos_hits <- dfm_select(dfm_wordstem(race_dfm), pattern = pos_terms,
                       valuetype = "glob")
topfeatures(pos_hits)

# Row numbers of race documents with at least one activism term.
# `@i` is ZERO-based and has one entry per non-zero cell. Used as-is it
# selects the document *before* each match (and silently drops index 0), and
# repeats documents that contain several different terms. Convert to
# one-based, de-duplicate and restore document order.
pos_indx <- sort(unique(pos_hits@i)) + 1L

# use index to subset DFM
pos_dfm <- race_dfm[pos_indx, ]

# Print the third matching document as a sanity check.
print(race_corp[pos_indx[3]], max_nchar = 4000)

# Activism-frame documents per date and channel.
pos_counts <- table(docvars(pos_dfm)$date_md, docvars(pos_dfm)$channel)



# Total -------------------------------------------------------------------
# Race-mentioning documents per date and channel.
race_counts <- table(docvars(race_dfm)$date_md, docvars(race_dfm)$channel)
# total counts: all documents per date and channel
tot_counts <- table(docvars(med_dfm)$date_md, docvars(med_dfm)$channel)

# Daily share of documents that mention race, per channel. Start from an
# all-zero table shaped like tot_counts so dates without any race-mentioning
# document stay at 0.
race_days <- which(rownames(tot_counts) %in% rownames(race_counts))
tot_counts_prop <- tot_counts
tot_counts_prop[, 1:2] <- 0
tot_counts_prop[race_days, 1:2] <- race_counts / tot_counts[race_days, 1:2]
# 0 / 0 (no documents at all on that date for a channel) gives NaN; treat
# it as 0.
tot_counts_prop[is.na(tot_counts_prop)] <- 0

# moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
# NOTE(review): row 145/146 is presumably the date of the event being
# studied -- confirm.
pre_rows <- 1:145
post_rows <- 146:nrow(tot_counts_prop)
for (ch in c("Fox", "MSNBC")) {
  tot_counts_prop[pre_rows, ch] <- ma(tot_counts_prop[pre_rows, ch]) # pre-average
  tot_counts_prop[post_rows, ch] <- ma(tot_counts_prop[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
tot_counts_prop <- as.data.frame(tot_counts_prop)
names(tot_counts_prop) <- c("date", "channel", "Freq")
tot_counts_prop$date <- as.Date(tot_counts_prop$date)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
tot_counts_prop$index <- rep(1:366, 2)

# Proportion of Coverage
# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

prop_plot_ma <- ggplot(
  tot_counts_prop,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(tot_counts_prop, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(tot_counts_prop, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(
    x = "", y = "Proportion",
    title = "Mentions as Share of Total Daily Discussion"
  ) +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 20, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
prop_plot_ma

# Counts
# Daily number of race-mentioning documents per channel, laid out on the full
# date grid of tot_counts so dates without any such document stay at 0.
race_days <- which(rownames(tot_counts) %in% rownames(race_counts))
race_counts_df <- tot_counts
race_counts_df[, 1:2] <- 0
race_counts_df[race_days, 1:2] <- race_counts

# Moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
pre_rows <- 1:145
post_rows <- 146:nrow(race_counts_df)
for (ch in c("Fox", "MSNBC")) {
  race_counts_df[pre_rows, ch] <- ma(race_counts_df[pre_rows, ch]) # pre-average
  race_counts_df[post_rows, ch] <- ma(race_counts_df[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
race_counts_df <- as.data.frame(race_counts_df)
names(race_counts_df) <- c("date", "channel", "Freq")
race_counts_df$date <- as.Date(race_counts_df$date)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
race_counts_df$index <- rep(1:366, 2)

# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

count_plot_ma <- ggplot(
  race_counts_df,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(race_counts_df, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(race_counts_df, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(x = "", y = "Count", title = "Number of Mentions") +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 20, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
count_plot_ma


# Exporting for STATA -----------------------------------------------------
# Recover the smoothed daily series from the two "Total" plots and tag each
# with the measure it holds.
count_df <- count_plot_ma$data
count_df[["df"]] <- "counts"

prop_df <- prop_plot_ma$data
prop_df[["df"]] <- "prop"

# Stack both measures into one long table.
dat <- rbind(count_df, prop_df)

# Spread the channels into columns: one row per date / index / measure.
dat2 <- reshape(
  dat,
  direction = "wide",
  timevar = "channel",
  idvar = c("date", "index", "df")
)

write.dta(dat2, file = "./data/media_frequency.dta")

# Summary Counts and Example Texts for Appendix ---------------------------------------------
# Requires running preceding cleaning file (fx_c and msnbc_c are not created
# in this script)
# Full: corpus size overall and per channel
ndoc(med_corp)
table(docvars(med_corp)$channel)

# Per-document summaries for every document of each channel corpus, computed
# once and reused for all the statistics below.
fx_summary <- summary(fx_c, ndoc(fx_c))
msnbc_summary <- summary(msnbc_c, ndoc(msnbc_c))

# Sentences per document: mean, then standard deviation
mean(fx_summary$Sentences)
mean(msnbc_summary$Sentences)
sd(fx_summary$Sentences)
sd(msnbc_summary$Sentences)

# Tokens per document: mean, then standard deviation
mean(fx_summary$Tokens)
mean(msnbc_summary$Tokens)
sd(fx_summary$Tokens)
sd(msnbc_summary$Tokens)

# Race: number of race-mentioning documents overall and per channel
sum(race_counts)
colSums(race_counts)

# Reproducible random sample of 8 race-mentioning documents per channel
# (Fox is drawn first, then MSNBC).
set.seed(1693)
fox_race <- corpus_subset(race_corp, channel == "Fox")
msnbc_race <- corpus_subset(race_corp, channel == "MSNBC")
f_smp <- sample(seq_len(ndoc(fox_race)), 8)
m_smp <- sample(seq_len(ndoc(msnbc_race)), 8)

print(fox_race[f_smp], max_nchar = 5000, max_ndoc = 8)
print(msnbc_race[m_smp], max_nchar = 5000, max_ndoc = 8)


# Frames ------------------------------------------------------------------

# * Activism -------------------------------------------------------------
# Share of each day's race-mentioning documents that fall in the activism
# frame (pos_counts / race_counts), per channel, laid out on the full date
# grid of tot_counts.
tot_counts_prop <- tot_counts
tot_counts_prop[, 1:2] <- 0
# The rows being filled belong to a table shaped like tot_counts, so their
# positions must be looked up in tot_counts. Positions within race_counts
# only coincide with them when every date has at least one race-mentioning
# document; otherwise values land on the wrong dates.
pos_rows_tot <- which(rownames(tot_counts) %in% rownames(pos_counts))
pos_rows_race <- which(rownames(race_counts) %in% rownames(pos_counts))
tot_counts_prop[pos_rows_tot, 1:2] <- pos_counts / race_counts[pos_rows_race, 1:2]
# 0 / 0 gives NaN; treat it as 0.
tot_counts_prop[is.na(tot_counts_prop)] <- 0

# moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
pre_rows <- 1:145
post_rows <- 146:nrow(tot_counts_prop)
for (ch in c("Fox", "MSNBC")) {
  tot_counts_prop[pre_rows, ch] <- ma(tot_counts_prop[pre_rows, ch]) # pre-average
  tot_counts_prop[post_rows, ch] <- ma(tot_counts_prop[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
tot_counts_prop <- as.data.frame(tot_counts_prop)
names(tot_counts_prop) <- c("date", "channel", "Freq")
tot_counts_prop$date <- as.Date(tot_counts_prop$date)
tot_counts_prop$Freq <- as.numeric(tot_counts_prop$Freq)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
tot_counts_prop$index <- rep(1:366, 2)

# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

prop_plot_activ_ma <- ggplot(
  tot_counts_prop,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(tot_counts_prop, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(tot_counts_prop, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(
    x = "", y = "Proportion",
    title = "Daily Proportion of Mentions\nof Blacks containing Frame per Network"
  ) +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
prop_plot_activ_ma

# Daily number of activism-frame documents per channel, laid out on the full
# date grid of tot_counts so dates without any such document stay at 0.
race_counts_df <- tot_counts
race_counts_df[, 1:2] <- 0
# The rows being filled belong to a table shaped like tot_counts, so their
# positions must be looked up in tot_counts. Positions within race_counts
# only coincide with them when every date has at least one race-mentioning
# document; otherwise counts land on the wrong dates.
pos_rows_tot <- which(rownames(tot_counts) %in% rownames(pos_counts))
race_counts_df[pos_rows_tot, 1:2] <- pos_counts

# Moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
pre_rows <- 1:145
post_rows <- 146:nrow(race_counts_df)
for (ch in c("Fox", "MSNBC")) {
  race_counts_df[pre_rows, ch] <- ma(race_counts_df[pre_rows, ch]) # pre-average
  race_counts_df[post_rows, ch] <- ma(race_counts_df[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
race_counts_df <- as.data.frame(race_counts_df)
names(race_counts_df) <- c("date", "channel", "Freq")
race_counts_df$date <- as.Date(race_counts_df$date)
race_counts_df$Freq <- as.numeric(race_counts_df$Freq)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
race_counts_df$index <- rep(1:366, 2)

# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

count_plot_activ_ma <- ggplot(
  race_counts_df,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(race_counts_df, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(race_counts_df, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(
    x = "", y = "Count",
    title = "Daily Counts of Frames\nin Episodes per Network"
  ) +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
count_plot_activ_ma


# * Backlash -------------------------------------------------------------
# Share of each day's race-mentioning documents that fall in the backlash
# frame (neg_counts / race_counts), per channel, laid out on the full date
# grid of tot_counts.
tot_counts_prop <- tot_counts
tot_counts_prop[, 1:2] <- 0
# The rows being filled belong to a table shaped like tot_counts, so their
# positions must be looked up in tot_counts. Positions within race_counts
# only coincide with them when every date has at least one race-mentioning
# document; otherwise values land on the wrong dates.
neg_rows_tot <- which(rownames(tot_counts) %in% rownames(neg_counts))
neg_rows_race <- which(rownames(race_counts) %in% rownames(neg_counts))
tot_counts_prop[neg_rows_tot, 1:2] <- neg_counts / race_counts[neg_rows_race, 1:2]
# 0 / 0 gives NaN; treat it as 0.
tot_counts_prop[is.na(tot_counts_prop)] <- 0

# moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
pre_rows <- 1:145
post_rows <- 146:nrow(tot_counts_prop)
for (ch in c("Fox", "MSNBC")) {
  tot_counts_prop[pre_rows, ch] <- ma(tot_counts_prop[pre_rows, ch]) # pre-average
  tot_counts_prop[post_rows, ch] <- ma(tot_counts_prop[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
tot_counts_prop <- as.data.frame(tot_counts_prop)
names(tot_counts_prop) <- c("date", "channel", "Freq")
tot_counts_prop$date <- as.Date(tot_counts_prop$date)
tot_counts_prop$Freq <- as.numeric(tot_counts_prop$Freq)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
tot_counts_prop$index <- rep(1:366, 2)

# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

prop_plot_back_ma <- ggplot(
  tot_counts_prop,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(tot_counts_prop, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(tot_counts_prop, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(
    x = "", y = "Proportion",
    title = "Daily Proportion of Mentions\nof Blacks containing Frame per Network"
  ) +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 20, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
prop_plot_back_ma

# Daily number of backlash-frame documents per channel, laid out on the full
# date grid of tot_counts so dates without any such document stay at 0.
race_counts_df <- tot_counts
race_counts_df[, 1:2] <- 0
# The rows being filled belong to a table shaped like tot_counts, so their
# positions must be looked up in tot_counts. Positions within race_counts
# only coincide with them when every date has at least one race-mentioning
# document; otherwise counts land on the wrong dates.
neg_rows_tot <- which(rownames(tot_counts) %in% rownames(neg_counts))
race_counts_df[neg_rows_tot, 1:2] <- neg_counts

# Moving average, computed separately for rows 1-145 and rows 146-end so the
# window never averages across the break. The first and last row of each
# segment come back as NA because ma() has no full window there.
pre_rows <- 1:145
post_rows <- 146:nrow(race_counts_df)
for (ch in c("Fox", "MSNBC")) {
  race_counts_df[pre_rows, ch] <- ma(race_counts_df[pre_rows, ch]) # pre-average
  race_counts_df[post_rows, ch] <- ma(race_counts_df[post_rows, ch]) # post-average
}

# Long format: one row per date x channel.
race_counts_df <- as.data.frame(race_counts_df)
names(race_counts_df) <- c("date", "channel", "Freq")
race_counts_df$date <- as.Date(race_counts_df$date)
race_counts_df$Freq <- as.numeric(race_counts_df$Freq)

# Day-of-year index used as the x axis; assumes 366 daily rows for each of
# the 2 channels (the assignment errors if the table has another shape).
race_counts_df$index <- rep(1:366, 2)

# Axis breaks: day-of-year of the first of each month in a leap year.
month_starts <- c(1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336)

count_plot_back_ma <- ggplot(
  race_counts_df,
  aes(x = index, y = Freq, group = channel)
) +
  geom_vline(xintercept = 145, lty = "dashed", color = "grey") +
  geom_point(aes(color = channel, shape = channel), alpha = .5) +
  # Separate smooths on either side of the break; index 145 itself is in
  # neither subset. `size` is kept (rather than `linewidth`) so the call still
  # works on ggplot2 versions older than 3.4.
  geom_smooth(
    data = subset(race_counts_df, index < 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  geom_smooth(
    data = subset(race_counts_df, index > 145),
    aes(color = channel, linetype = channel),
    se = FALSE, span = .8, size = 1
  ) +
  scale_color_manual(values = c("navy", "darkorange1")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_linetype_manual(values = c("solid", "twodash")) +
  labs(
    x = "", y = "Count",
    title = "Daily Counts of Frames\nin Episodes per Network"
  ) +
  theme_bw() +
  scale_x_continuous(
    breaks = month_starts,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 20, hjust = .5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 20),
    strip.background = element_blank()
  )
count_plot_back_ma

# Exporting to Stata ------------------------------------------------------
# Activism frame: recover the smoothed series from the two activism plots,
# tag each with its measure and frame, then stack them.
count_activ_df <- count_plot_activ_ma$data
count_activ_df[["df"]] <- "counts"
count_activ_df[["frame"]] <- "activism"

prop_activ_df <- prop_plot_activ_ma$data
prop_activ_df[["df"]] <- "prop"
prop_activ_df[["frame"]] <- "activism"

dat_activ <- rbind(count_activ_df, prop_activ_df)

# Spread the channels into columns: one row per date / index / measure / frame.
dat_activ2 <- reshape(
  dat_activ,
  direction = "wide",
  timevar = "channel",
  idvar = c("date", "index", "df", "frame")
)

# Backlash frame: same steps on the two backlash plots.
count_back_df <- count_plot_back_ma$data
count_back_df[["df"]] <- "counts"
count_back_df[["frame"]] <- "backlash"

prop_back_df <- prop_plot_back_ma$data
prop_back_df[["df"]] <- "prop"
prop_back_df[["frame"]] <- "backlash"

dat_back <- rbind(count_back_df, prop_back_df)

dat_back2 <- reshape(
  dat_back,
  direction = "wide",
  timevar = "channel",
  idvar = c("date", "index", "df", "frame")
)

# Both frames in one file, activism rows first.
dat_frame <- rbind(dat_activ2, dat_back2)

write.dta(dat_frame, file = "./data/media_frames.dta")

